if (!require(tidyverse)) {
install.packages("tidyverse")}
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
if (!require(data.table)) {
install.packages("data.table")}
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 4.1.1
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
if (!require(lubridate)) {
install.packages("lubridate")
}
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.1.1
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
air_2004 <- data.table::fread("2004_air.csv")
air_2019 <- data.table::fread("2019_air.csv")
dim(air_2004)
## [1] 19233 20
dim(air_2019)
## [1] 53086 20
str(air_2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 8.9 12.2 16.5 18.1 11.5 32.5 14 29.9 21 15.7 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 37 51 60 64 48 94 55 88 70 59 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88502 88502 88101 88502 88502 88101 88502 88502 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(air_2019)
## Classes 'data.table' and 'data.frame': 53086 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
#table(air_2004$Date)
#table(air_2019$Date)
summary(air_2004$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.10 6.00 10.10 13.13 16.30 251.00
summary(air_2019$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.200 4.000 6.500 7.734 9.900 120.900
##Checking to see if any NA's exist
mean(is.na(air_2004$`Daily Mean PM2.5 Concentration`))
## [1] 0
mean(is.na(air_2019$`Daily Mean PM2.5 Concentration`))
## [1] 0
air_2004_2019 <- rbind(air_2004,air_2019)
#air_2004_2019$Year <- format(as.Date(air_2004_2019$Date, format="%m/%d/%Y"),"%Y")
air_2004_2019 <- mutate(air_2004_2019, year = factor(rep(c(2004, 2019), c(nrow(air_2004), nrow(air_2019)))))
colnames(air_2004_2019)[which(names(air_2004_2019) == "Daily Mean PM2.5 Concentration")] <- "PM2.5"
colnames(air_2004_2019)[which(names(air_2004_2019) == "SITE_LATITUDE")] <- "lat"
colnames(air_2004_2019)[which(names(air_2004_2019) == "SITE_LONGITUDE")] <- "lon"
air_2004_2019 <- air_2004_2019[PM2.5>=0][order(PM2.5)]
#summary(air_2004_2019$PM2.5)
#gplot(air_2004_2019) +
#geom_histogram(mapping = aes(x = PM2.5))
#Part 3
#Create Leaflet map showing the locations of the sites
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.1.1
temp.pal <- colorFactor(palette ='Accent', domain=air_2004_2019$year)
leaflet(air_2004_2019) %>%
addProviderTiles('CartoDB.Positron') %>%
addCircles(
lat = ~lat, lng=~lon,
# HERE IS OUR PAL!
label = air_2004_2019$`Site Name`, color = ~ temp.pal(air_2004_2019$year),
opacity = 1, fillOpacity = 1, radius = 500
) %>%
# And a pretty legend
addLegend('bottomleft', pal=temp.pal, values=air_2004_2019$year,
title='Year', opacity=1)
# addMarkers(air_2004_2019$lon,air_2004_2019$lat, rank(-air_2004_2019$PM2.5) <= 10)
*From the leaflet map, we can see about an even distribution of sites from the 2004 data set and the 2019 data set. There does seem to be a bit more 2004 monitoring sites in comparison to the the 2019 sites. All of the sites seem to be mainly located on the coast of California rather near the borders of the state. #Part 4
#Check for missing or implausible values of PM2.5 in the combined dataset
#table(air_2004_2019$PM2.5)
summary(air_2004_2019$PM2.5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.400 7.300 9.208 11.300 251.000
#Part 5
whether daily concentrations of PM (particulate matter air pollution with aerodynamic diameter less than 2.5 m) have decreased in California over the last 15 years (from 2004 to 2019)
##State
library(ggplot2)
air_2004_2019[!is.na(PM2.5)] %>%
ggplot()+
geom_boxplot(mapping=aes(x=year, y=log2(PM2.5), fill=year)) + labs(title = "PM2.5 Levels in California in 2004 and in 2019") + labs(x = "Year", y = "log2([PM2.5])")
## Warning: Removed 81 rows containing non-finite values (stat_boxplot).
with(air_2004_2019, tapply(PM2.5, year, summary))
## $`2004`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.10 13.13 16.30 251.00
##
## $`2019`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 4.00 6.50 7.78 10.00 120.90
avg_pm_county <- group_by(air_2004_2019, year, COUNTY) %>% summarize(PM_AVG = mean(PM2.5, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
tail(avg_pm_county)
## # A tibble: 6 x 3
## # Groups: year [1]
## year COUNTY PM_AVG
## <fct> <chr> <dbl>
## 1 2019 Sutter 8.27
## 2 2019 Tehama 5.50
## 3 2019 Trinity 4.94
## 4 2019 Tulare 11.2
## 5 2019 Ventura 6.67
## 6 2019 Yolo 6.85
temp.pal <- colorFactor(palette ='Accent', domain=avg_pm_county$COUNTY)
ggplot(data=avg_pm_county, aes(x=as.numeric(as.character(year)), y=PM_AVG, group=COUNTY)) +
geom_line(color=temp.pal(avg_pm_county$COUNTY))+
geom_point(color=temp.pal(avg_pm_county$COUNTY))+
labs(title = "Average PM2.5 Levels Between 2004 to 2019 in All Counties") + labs(x = "Year", y = "Average [PM2.5]")
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Accent is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Accent is 8
## Returning the palette you asked for with that many colors
* While this graph, isn’t the most descriptive, it does get the point across that the Average PM2.5 concentration for each county decreased over the years.
sites <- filter(air_2004_2019, COUNTY_CODE == 37) %>% select(COUNTY_CODE, `Site ID`, year) %>% unique
sites <- mutate(sites, site.code = paste(COUNTY_CODE, `Site ID`, sep = "."))
str(sites)
## Classes 'data.table' and 'data.frame': 24 obs. of 4 variables:
## $ COUNTY_CODE: int 37 37 37 37 37 37 37 37 37 37 ...
## $ Site ID : int 60376012 60379033 60379034 60371602 60379034 60371201 60379033 60374004 60372005 60370002 ...
## $ year : Factor w/ 2 levels "2004","2019": 2 2 1 2 2 2 1 2 2 2 ...
## $ site.code : chr "37.60376012" "37.60379033" "37.60379034" "37.60371602" ...
## - attr(*, ".internal.selfref")=<externalptr>
site.year <- with(sites, split(site.code, year))
both <- intersect(site.year[[1]], site.year[[2]])
print(both)
## [1] "37.60379034" "37.60379033" "37.60370002" "37.60371103" "37.60371201"
## [6] "37.60374004" "37.60372005" "37.60374002"
count <- mutate(air_2004_2019, site.code = paste(COUNTY_CODE, `Site ID`, sep = ".")) %>% filter(site.code %in% both)
group_by(count, site.code) %>% summarize(n = n())
## # A tibble: 8 x 2
## site.code n
## <chr> <int>
## 1 37.60370002 400
## 2 37.60371103 1401
## 3 37.60371201 586
## 4 37.60372005 288
## 5 37.60374002 484
## 6 37.60374004 1043
## 7 37.60379033 468
## 8 37.60379034 219
#focus on Site ID 60371103
air_60371103 <- filter(air_2004_2019, COUNTY_CODE == "37" & `Site ID` == "60371103") %>% select(Date, year, PM2.5) %>% mutate(Date = format(as.Date(Date,format="%Y/%m/%d")), yday = format(as.Date(Date,format="%Y/%d")))
qplot(Date, PM2.5, data = air_60371103, facets = . ~ year, xlab = "Day of the year")
* From the graph, we can see that the PM2.5 Concentration did in fact decrease as noted in previous plots. This shows is very clearly as it maps the trends of the PM25 concentration by the days of the year of each year. However, what is interesting, is that while the 2004 points seem to be decreasing over the year, the 2019 dataset seems to be increasing over the year which can be attributed to a multitude of things, namely the rapid increase in fires in California specificly, but more generally, the increase in prescence of things like power plants and automobiles over of the years.